Check if there are any missing values in the training set and the submission set

# TRUE as soon as any cell of the data set is NA, FALSE otherwise
anyNA(train)
## [1] FALSE
anyNA(submission)
## [1] FALSE

As both return FALSE, there are no missing values


Check for the structure of the dataset

# Column-wise overview of the training set: type and first values per column
dplyr::glimpse(train)
## Observations: 36,168
## Variables: 17
## $ age       <int> 50, 47, 56, 36, 41, 32, 26, 60, 39, 55, 32, 30, 35, ...
## $ job       <fct> entrepreneur, technician, housemaid, blue-collar, ma...
## $ marital   <fct> married, married, married, married, married, single,...
## $ education <fct> primary, secondary, primary, primary, primary, terti...
## $ default   <fct> yes, no, no, no, no, no, no, no, no, no, no, no, no,...
## $ balance   <int> 537, -938, 605, 4608, 362, 0, 782, 193, 2140, 873, 0...
## $ housing   <fct> yes, yes, no, yes, yes, no, no, yes, yes, yes, no, y...
## $ loan      <fct> no, no, no, no, no, no, no, no, no, yes, no, no, no,...
## $ contact   <fct> unknown, unknown, cellular, cellular, cellular, cell...
## $ day       <int> 20, 28, 19, 14, 12, 4, 29, 12, 16, 3, 19, 27, 21, 8,...
## $ month     <fct> jun, may, aug, may, may, feb, jan, may, apr, jun, au...
## $ duration  <int> 11, 176, 207, 284, 217, 233, 297, 89, 539, 131, 103,...
## $ campaign  <int> 15, 2, 6, 7, 3, 3, 1, 2, 1, 1, 4, 3, 1, 2, 1, 8, 1, ...
## $ pdays     <int> -1, -1, -1, -1, -1, 276, -1, -1, -1, -1, -1, -1, -1,...
## $ previous  <int> 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ poutcome  <fct> unknown, unknown, unknown, unknown, unknown, failure...
## $ y         <fct> no, no, no, no, no, yes, no, no, no, no, no, no, no,...
# Same column-wise overview for the submission set
dplyr::glimpse(submission)
## Observations: 9,043
## Variables: 16
## $ age       <int> 58, 43, 51, 56, 32, 54, 58, 54, 32, 38, 57, 51, 35, ...
## $ job       <fct> management, technician, retired, management, blue-co...
## $ marital   <fct> married, single, married, married, single, married, ...
## $ education <fct> tertiary, secondary, primary, tertiary, primary, sec...
## $ default   <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, ...
## $ balance   <int> 2143, 593, 229, 779, 23, 529, -364, 1291, 0, 424, 24...
## $ housing   <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, ye...
## $ loan      <fct> no, no, no, no, yes, no, no, no, no, no, no, no, yes...
## $ contact   <fct> unknown, unknown, unknown, unknown, unknown, unknown...
## $ day       <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5...
## $ month     <fct> may, may, may, may, may, may, may, may, may, may, ma...
## $ duration  <int> 261, 55, 353, 164, 160, 1492, 355, 266, 179, 104, 16...
## $ campaign  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ pdays     <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ...
## $ previous  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ poutcome  <fct> unknown, unknown, unknown, unknown, unknown, unknown...

Comparing the training set with the submission set, the only column missing from the submission set is y, so y is the target variable.


Check for the factor levels of y

# Distinct values taken by the target column
unique(train[["y"]])
## [1] no  yes
## Levels: no yes

As shown, it returns only yes and no, so it’s a binary classification problem


Check for imbalance

# Count the observations in each class of the target variable.
target_df <- data.frame(table(train$y))
colnames(target_df) <- c("target", "count")

# Bar chart of the class counts, rendered through ggplotly().
ggplotly(
  ggplot(data = target_df, aes(x = target, y = count, fill = target)) +
    geom_bar(position = "dodge", stat = "identity", alpha = 0.5) +
    scale_fill_manual(
      "target",
      values = c("yes" = "dodgerblue", "no" = "firebrick1")
    ) +
    theme_classic()
)

# Share of "yes" observations in the training set. The mean of the logical
# vector is the proportion of TRUE values; y contains no NA (checked above).
mean(train$y == "yes")
## [1] 0.1159865

About 11.6% of the training observations have y = yes; no over- or under-sampling is needed.


Numerical Features

As the histograms show, most of the numerical features are heavily skewed and need to be normalized.


Categorical Features

As shown from the hists, most of the numerical features are heavily skewed, and need to be normalized


Correlation between two numerical features

# Keep only the numeric columns and compute their pairwise correlations.
train_num <- dplyr::select_if(train, is.numeric)
res <- cor(train_num)

# Mixed plot: coefficients as numbers below the diagonal, circles above it.
corrplot.mixed(
  res,
  lower = "number",
  upper = "circle",
  tl.col = "black",
  tl.cex = 0.8,
  number.cex = 0.8
)

As the correlation plot shows, there is no strong correlation between most pairs of numerical features. The one exception is pdays and previous, with a correlation of 0.54.

# Mean age for every job / target combination, one row per combination.
# aggregate() stores the aggregated values in a column named "x".
data <- aggregate(
  train$age,
  by = list(job = train$job, target = train$y),
  FUN = mean
)
# Reshape to wide form: one row per job, one column per target level.
# value.var is given explicitly so dcast() does not have to guess it.
data <- dcast(data, job ~ target, value.var = "x")
rnames <- data[, 1]
mat_data <- data.matrix(data[, 2:ncol(data)])

rownames(mat_data) <- rnames

# 299 colours running red -> yellow -> green, with 300 break points
# (3 x 100) delimiting them.
my_palette <- colorRampPalette(c("red", "yellow", "green"))(n = 299)
col_breaks <- c(
  seq(-1, 0, length.out = 100),   # for red
  seq(0, 0.8, length.out = 100),  # for yellow
  seq(0.81, 1, length.out = 100)  # for green
)

data
heatmap(
  mat_data,
  margins = c(12, 12),  # widens margins around plot
  col = my_palette,     # use the colour palette defined above
  breaks = col_breaks   # enable colour transition at specified limits
)

# Mean age for every education / target combination, one row per combination.
# aggregate() stores the aggregated values in a column named "x".
data <- aggregate(
  train$age,
  by = list(education = train$education, target = train$y),
  FUN = mean
)
# Reshape to wide form: one row per education level, one column per target
# level. value.var is given explicitly so dcast() does not have to guess it.
data <- dcast(data, education ~ target, value.var = "x")
rnames <- data[, 1]
mat_data <- data.matrix(data[, 2:ncol(data)])

rownames(mat_data) <- rnames

# 299 colours running red -> yellow -> green, with 300 break points
# (3 x 100) delimiting them.
my_palette <- colorRampPalette(c("red", "yellow", "green"))(n = 299)
col_breaks <- c(
  seq(-1, 0, length.out = 100),   # for red
  seq(0, 0.8, length.out = 100),  # for yellow
  seq(0.81, 1, length.out = 100)  # for green
)

data
heatmap(
  mat_data,
  margins = c(12, 12),  # widens margins around plot
  col = my_palette,     # use the colour palette defined above
  breaks = col_breaks   # enable colour transition at specified limits
)

# ggplotly(
  # pdays against previous; poutcome drives both the colour and the size
  # of the points.
  ggplot(train, aes(previous, pdays)) +
    geom_point(aes(colour = poutcome, size = poutcome), alpha = 0.15) +
    scale_size_discrete(range = c(3, 7)) +
    scale_colour_hue() +
    theme_classic()
## Warning: Using size for a discrete variable is not advised.

# )